0.1 Setup

suppressPackageStartupMessages({
  # General use
  library(tidyverse)
  library(knitr)
  library(magrittr)
  # Data formats
  library(animalcules)
  library(MultiAssayExperiment)
  library(SummarizedExperiment)
  # Modeling
  library(lme4)
  library(geepack)
})
## No methods found in package 'timeDate' for request: '[<-' when loading 'timeSeries'

0.2 Initial Data Extraction

# Read in MAE of data
OG_dat <- readRDS("data/animalculesFinalHIV.rds")
dat <- OG_dat[["MicrobeGenetics"]]

# Extract metadata for infants only
microbe <- dat[, which(dat$MothChild == "Infant")]
sam_table <- as.data.frame(colData(microbe))

# Both mothers and infants
microbe_mi <- dat
tax_table <- as.data.frame(SummarizedExperiment::rowData(microbe))

# Extract metadata, taxonomic info, and counts
tax_table_mi <- as.data.frame(SummarizedExperiment::rowData(microbe_mi))
sam_table_mi <- as.data.frame(SummarizedExperiment::colData(microbe_mi))
counts_table_mi <- as.data.frame(SummarizedExperiment::assay(
  microbe_mi, "MGX"))[, rownames(sam_table_mi)]

0.3 Statistics

The following statistics are calculated specifically for the infants in this study (HIV-E vs control infants).

0.3.1 Information for mothers’ and infants’ samples (using raw reads)

sam_table_mi %>%
  filter(MothChild == "Infant") %>%
  dplyr::group_by(timepoint) %>%
  dplyr::summarise("Average Infant Age" = round(mean(Age)),
                   "SD of Infant Age" = round(sd(Age)))
## # A tibble: 7 × 3
##   timepoint `Average Infant Age` `SD of Infant Age`
##       <int>                <dbl>              <dbl>
## 1         0                    7                  1
## 2         1                   22                  2
## 3         2                   42                  3
## 4         3                   58                  2
## 5         4                   74                  2
## 6         5                   88                  2
## 7         6                  105                  2
sam_table_mi %>%
  dplyr::rename("HIV Status" = HIVStatus) %>%
  mutate("Timepoint" = factor(timepoint)) %>%
  table1::table1(~ Timepoint + `HIV Status` | factor(MothChild),
               data = .)
Infant
(N=129)
Mother
(N=38)
Overall
(N=167)
Timepoint
0 20 (15.5%) 20 (52.6%) 40 (24.0%)
1 20 (15.5%) 0 (0%) 20 (12.0%)
2 19 (14.7%) 0 (0%) 19 (11.4%)
3 17 (13.2%) 0 (0%) 17 (10.2%)
4 17 (13.2%) 0 (0%) 17 (10.2%)
5 18 (14.0%) 0 (0%) 18 (10.8%)
6 18 (14.0%) 18 (47.4%) 36 (21.6%)
HIV Status
Control 68 (52.7%) 19 (50.0%) 87 (52.1%)
HIV 61 (47.3%) 19 (50.0%) 80 (47.9%)

Looking at reads

howmanyreads <- function(mothinf) {
    counts_table_mi %>%
    as_tibble() %>%
    mutate(species = tax_table_mi$species) %>%
    relocate(species) %>%
    pivot_longer(cols = starts_with("X"),
                 names_to = "Sample", values_to = "Abundance") %>%
    left_join(., sam_table_mi, by = "Sample") %>%
    # summarize
    filter(MothChild == mothinf) %>%
    group_by(Sample) %>%
    dplyr::summarise("Total Reads" = sum(`Abundance`)) %>%
    arrange(desc(`Total Reads`)) %>%
    dplyr::summarise(med_reads = median(`Total Reads`),
                     mean_reads = mean(`Total Reads`),
                     sd_reads = sd(`Total Reads`),
                     min_reads = min(`Total Reads`),
                     max_reads = max(`Total Reads`),
                     num_total = n())
}
howmanyreads("Mother")
## # A tibble: 1 × 6
##   med_reads mean_reads sd_reads min_reads max_reads num_total
##       <dbl>      <dbl>    <dbl>     <dbl>     <dbl>     <int>
## 1    54298.     70463.   48730.     14339    208029        38
howmanyreads("Infant")
## # A tibble: 1 × 6
##   med_reads mean_reads sd_reads min_reads max_reads num_total
##       <dbl>      <dbl>    <dbl>     <dbl>     <dbl>     <int>
## 1     65069    124905.  299120.     13218   3016276       129
sam_table %>%
  group_by(timepoint) %>%
  dplyr::summarise(median(`Age`))
## # A tibble: 7 × 2
##   timepoint `median(Age)`
##       <int>         <dbl>
## 1         0             6
## 2         1            22
## 3         2            42
## 4         3            58
## 5         4            74
## 6         5            88
## 7         6           104
# Number of mother samples
# NUmber of samples' reads

0.4 Percent abundances of taxa

create_long <- function(input_df, sam_tab = sam_table_mi,
                        tax_tab = tax_table_mi) {
  input_df %>%
    as_tibble() %>%
    mutate(species = tax_table_mi$species) %>%
    relocate(species) %>%
    pivot_longer(cols = starts_with("X"),
                 names_to = "Sample", values_to = "Abundance") %>%
    left_join(., sam_tab, by = "Sample") %>%
    left_join(., tax_tab, by = "species") %>%
    relocate(colnames(sam_tab))
}

alluvial_df <- create_long(counts_table_mi)

0.4.1 Abundances of different taxons

# Non-limited to any groupings, all samples!
alluvial_df %>%
  group_by(genus) %>%
  summarise(pct = round(sum(Abundance)/sum(alluvial_df$Abundance)*100, 4)) %>%
  arrange(desc(pct))
## # A tibble: 648 × 2
##    genus               pct
##    <chr>             <dbl>
##  1 Corynebacterium    3.38
##  2 Bacillus           2.78
##  3 Pseudomonas        2.18
##  4 Mycolicibacterium  1.81
##  5 Streptomyces       1.69
##  6 Acinetobacter      1.59
##  7 Staphylococcus     1.55
##  8 Paracoccus         1.45
##  9 Bradyrhizobium     1.33
## 10 Nocardioides       1.09
## # … with 638 more rows
alluvial_df %>%
  group_by(phylum) %>%
  summarise(pct = round(sum(Abundance)/sum(alluvial_df$Abundance)*100, 4)) %>%
  arrange(desc(pct))
## # A tibble: 17 × 2
##    phylum                  pct
##    <chr>                 <dbl>
##  1 Proteobacteria      37.5   
##  2 Actinobacteria      26.4   
##  3 Firmicutes          26.0   
##  4 Bacteroidetes        7.14  
##  5 Chloroflexi          0.847 
##  6 Fusobacteria         0.606 
##  7 Nitrospirae          0.363 
##  8 Acidobacteria        0.242 
##  9 Cyanobacteria        0.242 
## 10 Deinococcus-Thermus  0.242 
## 11 Planctomycetes       0.242 
## 12 Gemmatimonadetes     0.121 
## 13 Synergistetes        0.121 
## 14 Tenericutes          0.0005
## 15 Armatimonadetes      0     
## 16 Spirochaetes         0     
## 17 Verrucomicrobia      0

0.5 Beta Diversity Testing

0.5.1 Genus - Infant

# Genus
diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Infant" &
                                     OG_dat$timepoint == 0)),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard
                    input_select_beta_condition = "HIVStatus",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         HIV and between                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.33452417495224"                                 
##         Control and between                                
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.100432908365082"                                
##         Control and HIV                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.678599395713093"
diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Infant" &
                                     OG_dat$timepoint == 6)),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard
                    input_select_beta_condition = "HIVStatus",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         HIV and between                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.20814402240231"                                 
##         Control and between                                
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "1.05307939650421e-06"                             
##         Control and HIV                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.00175509727592689"
#animalcules::diversity_beta_boxplot(subsetByColData(OG_dat,
#                               which(OG_dat$MothChild == "Infant")),
#                    tax_level = "genus",
#                    input_beta_method = "bray", #jaccard
#                    input_select_beta_condition = "HIVStatus")

0.5.2 Genus - Mother

# Genus
diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Mother" &
                                     OG_dat$timepoint == 0)),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard 
                    input_select_beta_condition = "HIVStatus",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         HIV and between                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.826219946602891"                                
##         Control and between                                
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.419719970507453"                                
##         Control and HIV                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.700208637633154"
diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Mother" &
                                     OG_dat$timepoint == 6)),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard 
                    input_select_beta_condition = "HIVStatus",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         Control and between                                
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "5.85363786404608e-07"                             
##         HIV and between                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.867001372812487"                                
##         HIV and Control                                    
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.000101538359001655"
#animalcules::diversity_beta_boxplot(subsetByColData(OG_dat,
#                               which(OG_dat$MothChild == "Mother")),
#                    tax_level = "genus",
#                    input_beta_method = "bray", #jaccard
#                    input_select_beta_condition = "HIVStatus")

0.5.3 Genus - HEU infants t=0 vs t=6

# Genus
diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Infant" &
                                     OG_dat$HIVStatus == "HIV" &
                                     OG_dat$timepoint %in% c(0, 6))),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard 
                    input_select_beta_condition = "timepoint",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         6 and between                                      
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "1.38490591817349e-20"                             
##         0 and between                                      
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "2.21076524628771e-17"                             
##         0 and 6                                            
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.114781658698142"
#animalcules::diversity_beta_boxplot(subsetByColData(OG_dat,
#                               which(OG_dat$MothChild == "Mother")),
#                    tax_level = "genus",
#                    input_beta_method = "bray", #jaccard
#                    input_select_beta_condition = "HIVStatus")

0.5.4 Genus - HUU infants t=0 vs t=6

diversity_beta_test(subsetByColData(OG_dat,
                               which(OG_dat$MothChild == "Infant" &
                                     OG_dat$HIVStatus == "Control" &
                                     OG_dat$timepoint %in% c(0, 6))),
                    tax_level = "genus",
                    input_beta_method = "bray", #jaccard 
                    input_select_beta_condition = "timepoint",
                    input_select_beta_stat_method = "Wilcoxon rank sum test")
##         6 and between                                      
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "2.28981697925407e-28"                             
##         0 and between                                      
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "5.69660947216888e-29"                             
##         0 and 6                                            
## Method  "Wilcoxon rank sum test with continuity correction"
## P-value "0.781417368229006"

0.6 Plotting

0.6.1 Obtain most abundant genera

obtain_gn <- function(counts_tab, tax_tab) {
  all_relabu_genus <- counts_tab |>
    as.matrix() |>
    # Get rel abu within samokes
    prop.table(margin = 2) |>
    as_tibble() |>
    bind_cols(genus = tax_tab$genus) |>
    relocate(genus) |>
    group_by(genus) |>
    # Sum rel abu within samples/columns for genera
    summarise(across(.fns = sum)) %>%
    # Sum everything but the first columm ("genus")
    mutate(allmeans = apply(.[,-1], 1, mean)) |>
    select(genus, allmeans) |>
    mutate(genus = replace(genus, is.na(genus), "Unknown")) |>
    arrange(desc(allmeans))
  
  all_relabu_genus |>
    select(genus) |> unlist() |> unname() %>% .[1:20] %>%
    return()
}

ind <- microbe_mi$MothChild == "Infant"
best_genus <- obtain_gn(counts_table_mi[, ind], tax_table)
ind <- microbe_mi$MothChild == "Mother"
best_genus_m <- obtain_gn(counts_table_mi[, ind], tax_table_mi)

0.6.2 Infants

# Barplot for Control
p <- relabu_barplot(subsetByColData(OG_dat,
                               which(OG_dat$HIVStatus == "Control" &
                                       OG_dat$MothChild == "Infant")),
               tax_level = "genus",
               order_organisms = best_genus[1:20],
               sort_by = "conditions",
               group_samples = TRUE,
               group_conditions = "timepoint",
               show_legend = TRUE)

#plotly::export(p, file = "PaperFigs/RAW_barplot_control_inf.png",
#                   vwidth = 750, vheight = 550)
p
# Barplot for Control
p <- relabu_barplot(subsetByColData(OG_dat,
                               which(OG_dat$HIVStatus == "HIV" &
                                       OG_dat$MothChild == "Infant")),
               tax_level = "genus",
               order_organisms = best_genus[1:20],
               sort_by = "conditions",
               group_samples = TRUE,
               group_conditions = "timepoint",
               show_legend = TRUE)

#plotly::export(p, file = "PaperFigs/RAW_barplot_HIV_inf.png",
#                   vwidth = 750, vheight = 550)
p

0.6.3 Mothers

# Barplot for Control
p <- relabu_barplot(subsetByColData(OG_dat,
                               which(OG_dat$HIVStatus == "Control" &
                                       OG_dat$MothChild == "Mother")),
               tax_level = "genus",
               order_organisms = best_genus_m[1:20],
               sort_by = "conditions",
               group_samples = TRUE,
               group_conditions = "timepoint",
               show_legend = TRUE)

#plotly::export(p, file = "PaperFigs/RAW_barplot_control_mom.png",
#                   vwidth = 750, vheight = 550)
p
# Barplot for Control
p <- relabu_barplot(subsetByColData(OG_dat,
                               which(OG_dat$HIVStatus == "HIV" &
                                       OG_dat$MothChild == "Mother")),
               tax_level = "genus",
               order_organisms = best_genus_m[1:20],
               sort_by = "conditions",
               group_samples = TRUE,
               group_conditions = "timepoint",
               show_legend = TRUE)

#plotly::export(p, file = "PaperFigs/RAW_barplot_HIV_mom.png",
#                   vwidth = 750, vheight = 550)
p